# Data manipulation and visualization
library(tidyverse)
library(here)
library(janitor)
# Data analysis/statistics
library(car) #ANOVA
library(ggResidpanel) #residual panel
library(ggeffects)
library(effects) #dependency for ggeffects
library(MuMIn)
library(gam)
library(mgcv)
FCM <- read_csv(here("HW11", "data", "HOT_cyto_counts.csv"))
MLDS <- read_csv(here("HW11", "data", "hot_mlds.csv"))
FCM <- clean_names(FCM)
MLDS <- clean_names(MLDS)
# reformating the date and extracting the day of the year
# pressure turned negative for easier visualization later on
FCM <- FCM %>% mutate(cal_date = mdy(date)) %>%
relocate(cal_date, .after= date) %>%
mutate(doyear = yday(cal_date)) %>%
relocate(doyear, .after= cal_date) %>%
mutate(press = -1*press)
## Correction
mod <- gam(log10(pro+0.01) ~ s(doyear, press, k =50), data= FCM)
gam.check(mod)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 5 iterations.
## The RMS GCV score gradient at convergence was 1.788017e-07 .
## The Hessian was positive definite.
## Model rank = 50 / 50
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(doyear,press) 49.0 30.7 1.08 1
plot(mod, scheme = 2, shift = coef(mod)[1], trans = exp)
# making models of the microbes as a fucntion of pressure only
proxdepth <- gam(sqrt(pro) ~ s(press), data= FCM)
hbactxdepth <- gam(hbact ~ s(press), data= FCM)
picoeukxdepth <- gam(sqrt(picoeuk)~ s(press), data= FCM)
par(mfrow = c(1,3))
plot(proxdepth, main = "pro", residuals = TRUE, cex = 4, lwd = 3, shade = TRUE, shade.col = "green")
plot(hbactxdepth, main = "hbact", residuals = TRUE, cex = 4, lwd = 3, shade = TRUE, shade.col = "red")
plot(picoeukxdepth, main= "pico euk", residuals = TRUE, cex = 4, lwd = 3, shade = TRUE, shade.col = "royalblue")
# making models of the microbes as a fucntion of day of year only
proxday <- gam(pro ~ s(doyear), data= FCM)
hbactxday <- gam(hbact ~ s(doyear), data= FCM)
picoeukxday <- gam(sqrt(picoeuk)~ s(doyear), data= FCM)
par(mfrow = c(1,3))
plot(proxday, main = "pro", residuals = TRUE, cex = 4, lwd = 3, shade = TRUE, shade.col = "green")
plot(hbactxday, main = "hbact", residuals = TRUE, cex = 4, lwd = 3, shade = TRUE, shade.col = "red")
plot(picoeukxday, main = "pico euk", residuals = TRUE, cex = 4, lwd = 3, shade = TRUE, shade.col = "royalblue")
# making the comprehensive model with pressure and day of year for prochlorococcus
pro2D <- gam(sqrt(pro) ~ s(doyear, press), data= FCM)
pro2D_X <- gam(sqrt(pro) ~ s(doyear) + s(press), data= FCM)
summary(pro2D)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## sqrt(pro) ~ s(doyear, press)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.037586 0.004633 223.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(doyear,press) 27.39 28.84 375.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.899 Deviance explained = 90.1%
## GCV = 0.026751 Scale est. = 0.026127 n = 1217
anova(pro2D, pro2D_X)
## Analysis of Deviance Table
##
## Model 1: sqrt(pro) ~ s(doyear, press)
## Model 2: sqrt(pro) ~ s(doyear) + s(press)
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 1187.2 31.055
## 2 1202.5 35.707 -15.339 -4.6524 11.609 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The F value of the anova is significant which means that the interaction between depth and day of year explains the variation in abundance more than the variables alone.
# making the comprehensive model with pressure and day of year for het bacteria
hbact2D <- gam(hbact ~ s(doyear, press), data= FCM)
hbact2D_X <- gam(hbact ~ s(doyear) + s(press), data= FCM)
summary(hbact2D)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## hbact ~ s(doyear, press)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.88022 0.01811 214.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(doyear,press) 22.85 26.88 123.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.732 Deviance explained = 73.7%
## GCV = 0.40709 Scale est. = 0.39911 n = 1217
anova(hbact2D, hbact2D_X)
## Analysis of Deviance Table
##
## Model 1: hbact ~ s(doyear, press)
## Model 2: hbact ~ s(doyear) + s(press)
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 1189.1 476.20
## 2 1201.9 524.42 -12.83 -48.22 9.4167 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The F value of the anova is significant which means that the interaction between depth and day of year explains the variation in abundance more than the variables alone.
# making the comprehensive model with pressure and day of year for pico eukaryotes
picoeuk2D <- gam(sqrt(picoeuk) ~ s(doyear, press), data= FCM)
picoeuk2D_X <- gam(sqrt(picoeuk) ~ s(doyear) + s(press), data= FCM)
summary(picoeuk2D)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## sqrt(picoeuk) ~ s(doyear, press)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0933431 0.0006991 133.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(doyear,press) 24.31 27.73 45.82 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.514 Deviance explained = 52.3%
## GCV = 0.00060737 Scale est. = 0.00059474 n = 1217
anova(picoeuk2D,picoeuk2D_X)
## Analysis of Deviance Table
##
## Model 1: sqrt(picoeuk) ~ s(doyear, press)
## Model 2: sqrt(picoeuk) ~ s(doyear) + s(press)
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 1188.3 0.70875
## 2 1200.0 0.77479 -11.746 -0.066037 9.4531 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The F value of the anova is significant which means that the interaction between depth and day of year explains the variation in abundance more than the variables alone.
par(mfrow = c(1,3))
plot(pro2D, main = "pro", residuals = TRUE, cex = 4, lwd = 3, cex.main = 3)
abline(v = 60, col = "pink",lwd = 3) #spring
abline(v = 152, col = "green",lwd = 3) #summer
abline(v = 244, col = "orange",lwd = 3) # fall
abline(v = 335, col = "skyblue",lwd = 3) # winter
plot(hbact2D, main = "hbact", residuals = TRUE, cex = 4, lwd = 3, cex.main = 3)
abline(v = 60, col = "pink",lwd = 3) #spring
abline(v = 152, col = "green",lwd = 3) #summer
abline(v = 244, col = "orange",lwd = 3) # fall
abline(v = 335, col = "skyblue",lwd = 3) # winter
plot(picoeuk2D, main = "pico euk", residuals = TRUE, cex = 4, lwd = 3, cex.main = 3)
abline(v = 60, col = "pink",lwd = 3) #spring
abline(v = 152, col = "green",lwd = 3) #summer
abline(v = 244, col = "orange",lwd = 3) # fall
abline(v = 335, col = "skyblue",lwd = 3) # winter
par(mfrow = c(1,3))
vis.gam(pro2D, view = c("doyear", "press"), plot.type = "contour")
abline(v = 60, col = "pink",lwd = 3) #spring
abline(v = 152, col = "green",lwd = 3) #summer
abline(v = 244, col = "orange",lwd = 3) # fall
abline(v = 335, col = "skyblue",lwd = 3) # winter
vis.gam(hbact2D, view = c("doyear", "press"), plot.type = "contour")
abline(v = 60, col = "pink",lwd = 3) #spring
abline(v = 152, col = "green",lwd = 3) #summer
abline(v = 244, col = "orange",lwd = 3) # fall
abline(v = 335, col = "skyblue",lwd = 3) # winter
vis.gam(picoeuk2D, view = c("doyear", "press"), plot.type = "contour")
abline(v = 60, col = "pink",lwd = 3) #spring
abline(v = 152, col = "green",lwd = 3) #summer
abline(v = 244, col = "orange",lwd = 3) # fall
abline(v = 335, col = "skyblue",lwd = 3) # winter
FCM_long <- FCM %>%
pivot_longer(cols = hbact:picoeuk,
names_to = "microbe",
values_to = "micr_conc") %>%
filter(microbe != "syn") %>%
mutate(microbe = factor(microbe, levels= c("pro", "hbact", "picoeuk")))
microbe_model <- gam(micr_conc ~ s(doyear, press, by = microbe) + microbe, data= FCM_long)
summary(microbe_model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## micr_conc ~ s(doyear, press, by = microbe) + microbe
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.33514 0.01210 110.36 <2e-16 ***
## microbehbact 2.54508 0.01711 148.75 <2e-16 ***
## microbepicoeuk -1.32520 0.01711 -77.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(doyear,press):microbepro 27.1 28.78 173.502 <2e-16 ***
## s(doyear,press):microbehbact 25.8 28.39 262.887 <2e-16 ***
## s(doyear,press):microbepicoeuk 2.0 2.00 0.013 0.987
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.947 Deviance explained = 94.8%
## GCV = 0.181 Scale est. = 0.17813 n = 3651
# I don't know why my picoeukaryotes plot shows a linear trend instead of a contour plot...
par(mfrow = c(1,3))
plot(microbe_model, residuals = TRUE, cex = 4, lwd = 3, cex.main = 3)
(a) Do the different kinds of microbes have different mean
abundances?
Yes, we saw that the estimated intercepts are different with the
summary(microbe_model).
(b) Do the different kinds of microbes have different average depth distributions (i.e., averaging over time)?
# we can compare the full model to a model in which only the depth (pressure) is accounted for in the variation of abundance of the microbes
microbe_depth <- gam(micr_conc ~ s(press, by = microbe) + microbe, data= FCM_long)
par(mfrow = c(1,3))
plot(microbe_depth)
# The F value of the anova is significant for pro and hbact which means that they are differently distributed on average by depth when averaging over time.
summary(microbe_depth)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## micr_conc ~ s(press, by = microbe) + microbe
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.33514 0.01326 100.72 <2e-16 ***
## microbehbact 2.54508 0.01875 135.77 <2e-16 ***
## microbepicoeuk -1.32520 0.01875 -70.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(press):microbepro 5.905 6.719 598.438 <2e-16 ***
## s(press):microbehbact 4.999 5.934 966.766 <2e-16 ***
## s(press):microbepicoeuk 1.000 1.000 0.015 0.904
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.936 Deviance explained = 93.7%
## GCV = 0.21471 Scale est. = 0.21383 n = 3651
(c) Do the different kinds of microbes have different average seasonal dynamics (i.e., averaging over depths)
# we can make a model that only accounts for the day of the year as a predictor for the variation in microbe abundance
microbe_doyear <- gam(micr_conc ~ s(doyear, by = microbe) + microbe, data= FCM_long)
par(mfrow = c(1,3))
plot(microbe_doyear)
# the F value of the anova is only significant for hbact, which means that hetero trophic bacteria are distributed differently throughout the year.
summary(microbe_doyear)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## micr_conc ~ s(doyear, by = microbe) + microbe
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.33514 0.02521 52.97 <2e-16 ***
## microbehbact 2.54508 0.03565 71.39 <2e-16 ***
## microbepicoeuk -1.32520 0.03565 -37.17 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(doyear):microbepro 2.077 2.581 1.922 0.131
## s(doyear):microbehbact 6.485 7.617 8.084 <2e-16 ***
## s(doyear):microbepicoeuk 1.000 1.000 0.002 0.965
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.77 Deviance explained = 77.1%
## GCV = 0.77597 Scale est. = 0.7733 n = 3651
MLDS<- MLDS %>% rename(cruise = crn)
full_df <- left_join(FCM, MLDS, by = c("cruise", "date"))
HOT <- full_df %>% select(-julian, -syn, -botid, -x1) %>%
# we only want the data of the top 45m
filter(press <=45) %>%
filter(mean <=45) %>%
pivot_longer(cols = hbact:picoeuk,
names_to = "microbe",
values_to = "micr_conc") %>%
mutate(microbe = factor(microbe, levels= c("pro", "hbact", "picoeuk"))) %>%
rename(mixlay = mean)
#ML_gam <- gam(micr_conc ~ s(press, by = microbe)+ s(doyear, mixlay, by = microbe) + microbe, data = HOT)
ML_gam <- gam(micr_conc ~ s(mixlay, by = microbe) + microbe, data = HOT)
summary(ML_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## micr_conc ~ s(mixlay, by = microbe) + microbe
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.36839 0.09561 14.31 <2e-16 ***
## microbehbact 2.47610 0.13521 18.31 <2e-16 ***
## microbepicoeuk -1.35897 0.13521 -10.05 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(mixlay):microbepro 1.000 1.000 0.042 0.837407
## s(mixlay):microbehbact 4.577 5.453 4.568 0.000433 ***
## s(mixlay):microbepicoeuk 1.000 1.000 0.000 0.996629
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 29/30
## R-sq.(adj) = 0.782 Deviance explained = 79%
## GCV = 0.75254 Scale est. = 0.72213 n = 237
par(mfrow = c(1,3))
plot(ML_gam)
We can see that the heterotrophic bacteria are significantly associated with the variation in mixed layer depth. There is a peak of hbact abundance around 32m and a low in hbact abundance at 39m.
chl_gam <- gam(micr_conc ~ s(chl, by = microbe) + microbe, data = HOT)
summary(chl_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## micr_conc ~ s(chl, by = microbe) + microbe
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.40333 0.09019 15.56 <2e-16 ***
## microbehbact 2.56013 0.12755 20.07 <2e-16 ***
## microbepicoeuk -1.39299 0.12755 -10.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(chl):microbepro 3.895 4.825 4.976 0.000385 ***
## s(chl):microbehbact 7.952 8.711 5.825 1.46e-06 ***
## s(chl):microbepicoeuk 1.000 1.000 0.003 0.959428
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.834 Deviance explained = 84.6%
## GCV = 0.61587 Scale est. = 0.56939 n = 210
par(mfrow = c(1,3))
plot(chl_gam)